Kukutla Manohar
21BCE9466
VIT-AP
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv("C:/Users/Manu/Downloads/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
df.describe()
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | ... | 1470.000000 | 1470.0 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| mean | 36.923810 | 802.485714 | 9.192517 | 2.912925 | 1.0 | 1024.865306 | 2.721769 | 65.891156 | 2.729932 | 2.063946 | ... | 2.712245 | 80.0 | 0.793878 | 11.279592 | 2.799320 | 2.761224 | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | 403.509100 | 8.106864 | 1.024165 | 0.0 | 602.024335 | 1.093082 | 20.329428 | 0.711561 | 1.106940 | ... | 1.081209 | 0.0 | 0.852077 | 7.780782 | 1.289271 | 0.706476 | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | 102.000000 | 1.000000 | 1.000000 | 1.0 | 1.000000 | 1.000000 | 30.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 80.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 465.000000 | 2.000000 | 2.000000 | 1.0 | 491.250000 | 2.000000 | 48.000000 | 2.000000 | 1.000000 | ... | 2.000000 | 80.0 | 0.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 802.000000 | 7.000000 | 3.000000 | 1.0 | 1020.500000 | 3.000000 | 66.000000 | 3.000000 | 2.000000 | ... | 3.000000 | 80.0 | 1.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 1157.000000 | 14.000000 | 4.000000 | 1.0 | 1555.750000 | 4.000000 | 83.750000 | 3.000000 | 3.000000 | ... | 4.000000 | 80.0 | 1.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 1499.000000 | 29.000000 | 5.000000 | 1.0 | 2068.000000 | 4.000000 | 100.000000 | 4.000000 | 5.000000 | ... | 4.000000 | 80.0 | 3.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
8 rows × 26 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
df.corr()
C:\Users\Manu\AppData\Local\Temp\ipykernel_13256\1134722465.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr()
| Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.010661 | -0.001686 | 0.208034 | NaN | -0.010145 | 0.010146 | 0.024287 | 0.029820 | 0.509604 | ... | 0.053535 | NaN | 0.037510 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| DailyRate | 0.010661 | 1.000000 | -0.004985 | -0.016806 | NaN | -0.050990 | 0.018355 | 0.023381 | 0.046135 | 0.002966 | ... | 0.007846 | NaN | 0.042143 | 0.014515 | 0.002453 | -0.037848 | -0.034055 | 0.009932 | -0.033229 | -0.026363 |
| DistanceFromHome | -0.001686 | -0.004985 | 1.000000 | 0.021042 | NaN | 0.032916 | -0.016075 | 0.031131 | 0.008783 | 0.005303 | ... | 0.006557 | NaN | 0.044872 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| Education | 0.208034 | -0.016806 | 0.021042 | 1.000000 | NaN | 0.042070 | -0.027128 | 0.016775 | 0.042438 | 0.101589 | ... | -0.009118 | NaN | 0.018422 | 0.148280 | -0.025100 | 0.009819 | 0.069114 | 0.060236 | 0.054254 | 0.069065 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | -0.010145 | -0.050990 | 0.032916 | 0.042070 | NaN | 1.000000 | 0.017621 | 0.035179 | -0.006888 | -0.018519 | ... | -0.069861 | NaN | 0.062227 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| EnvironmentSatisfaction | 0.010146 | 0.018355 | -0.016075 | -0.027128 | NaN | 0.017621 | 1.000000 | -0.049857 | -0.008278 | 0.001212 | ... | 0.007665 | NaN | 0.003432 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| HourlyRate | 0.024287 | 0.023381 | 0.031131 | 0.016775 | NaN | 0.035179 | -0.049857 | 1.000000 | 0.042861 | -0.027853 | ... | 0.001330 | NaN | 0.050263 | -0.002334 | -0.008548 | -0.004607 | -0.019582 | -0.024106 | -0.026716 | -0.020123 |
| JobInvolvement | 0.029820 | 0.046135 | 0.008783 | 0.042438 | NaN | -0.006888 | -0.008278 | 0.042861 | 1.000000 | -0.012630 | ... | 0.034297 | NaN | 0.021523 | -0.005533 | -0.015338 | -0.014617 | -0.021355 | 0.008717 | -0.024184 | 0.025976 |
| JobLevel | 0.509604 | 0.002966 | 0.005303 | 0.101589 | NaN | -0.018519 | 0.001212 | -0.027853 | -0.012630 | 1.000000 | ... | 0.021642 | NaN | 0.013984 | 0.782208 | -0.018191 | 0.037818 | 0.534739 | 0.389447 | 0.353885 | 0.375281 |
| JobSatisfaction | -0.004892 | 0.030571 | -0.003669 | -0.011296 | NaN | -0.046247 | -0.006784 | -0.071335 | -0.021476 | -0.001944 | ... | -0.012454 | NaN | 0.010690 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MonthlyIncome | 0.497855 | 0.007707 | -0.017014 | 0.094961 | NaN | -0.014829 | -0.006259 | -0.015794 | -0.015271 | 0.950300 | ... | 0.025873 | NaN | 0.005408 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| MonthlyRate | 0.028051 | -0.032182 | 0.027473 | -0.026084 | NaN | 0.012648 | 0.037600 | -0.015297 | -0.016322 | 0.039563 | ... | -0.004085 | NaN | -0.034323 | 0.026442 | 0.001467 | 0.007963 | -0.023655 | -0.012815 | 0.001567 | -0.036746 |
| NumCompaniesWorked | 0.299635 | 0.038153 | -0.029251 | 0.126317 | NaN | -0.001251 | 0.012594 | 0.022157 | 0.015012 | 0.142501 | ... | 0.052733 | NaN | 0.030075 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| PercentSalaryHike | 0.003634 | 0.022704 | 0.040235 | -0.011111 | NaN | -0.012944 | -0.031701 | -0.009062 | -0.017205 | -0.034730 | ... | -0.040490 | NaN | 0.007528 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | 0.001904 | 0.000473 | 0.027110 | -0.024539 | NaN | -0.020359 | -0.029548 | -0.002172 | -0.029071 | -0.021222 | ... | -0.031351 | NaN | 0.003506 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| RelationshipSatisfaction | 0.053535 | 0.007846 | 0.006557 | -0.009118 | NaN | -0.069861 | 0.007665 | 0.001330 | 0.034297 | 0.021642 | ... | 1.000000 | NaN | -0.045952 | 0.024054 | 0.002497 | 0.019604 | 0.019367 | -0.015123 | 0.033493 | -0.000867 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | 0.037510 | 0.042143 | 0.044872 | 0.018422 | NaN | 0.062227 | 0.003432 | 0.050263 | 0.021523 | 0.013984 | ... | -0.045952 | NaN | 1.000000 | 0.010136 | 0.011274 | 0.004129 | 0.015058 | 0.050818 | 0.014352 | 0.024698 |
| TotalWorkingYears | 0.680381 | 0.014515 | 0.004628 | 0.148280 | NaN | -0.014365 | -0.002693 | -0.002334 | -0.005533 | 0.782208 | ... | 0.024054 | NaN | 0.010136 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | -0.019621 | 0.002453 | -0.036942 | -0.025100 | NaN | 0.023603 | -0.019359 | -0.008548 | -0.015338 | -0.018191 | ... | 0.002497 | NaN | 0.011274 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | -0.021490 | -0.037848 | -0.026556 | 0.009819 | NaN | 0.010309 | 0.027627 | -0.004607 | -0.014617 | 0.037818 | ... | 0.019604 | NaN | 0.004129 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | 0.311309 | -0.034055 | 0.009508 | 0.069114 | NaN | -0.011240 | 0.001458 | -0.019582 | -0.021355 | 0.534739 | ... | 0.019367 | NaN | 0.015058 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | 0.212901 | 0.009932 | 0.018845 | 0.060236 | NaN | -0.008416 | 0.018007 | -0.024106 | 0.008717 | 0.389447 | ... | -0.015123 | NaN | 0.050818 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | 0.216513 | -0.033229 | 0.010029 | 0.054254 | NaN | -0.009019 | 0.016194 | -0.026716 | -0.024184 | 0.353885 | ... | 0.033493 | NaN | 0.014352 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | 0.202089 | -0.026363 | 0.014406 | 0.069065 | NaN | -0.009197 | -0.004999 | -0.020123 | 0.025976 | 0.375281 | ... | -0.000867 | NaN | 0.024698 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
26 rows × 26 columns
df.corr().DailyRate.sort_values(ascending=False)
C:\Users\Manu\AppData\Local\Temp\ipykernel_13256\2954484028.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr().DailyRate.sort_values(ascending=False)
DailyRate 1.000000 JobInvolvement 0.046135 StockOptionLevel 0.042143 NumCompaniesWorked 0.038153 JobSatisfaction 0.030571 HourlyRate 0.023381 PercentSalaryHike 0.022704 EnvironmentSatisfaction 0.018355 TotalWorkingYears 0.014515 Age 0.010661 YearsInCurrentRole 0.009932 RelationshipSatisfaction 0.007846 MonthlyIncome 0.007707 JobLevel 0.002966 TrainingTimesLastYear 0.002453 PerformanceRating 0.000473 DistanceFromHome -0.004985 Education -0.016806 YearsWithCurrManager -0.026363 MonthlyRate -0.032182 YearsSinceLastPromotion -0.033229 YearsAtCompany -0.034055 WorkLifeBalance -0.037848 EmployeeNumber -0.050990 EmployeeCount NaN StandardHours NaN Name: DailyRate, dtype: float64
df.isnull().any()
Age False Attrition False BusinessTravel False DailyRate False Department False DistanceFromHome False Education False EducationField False EmployeeCount False EmployeeNumber False EnvironmentSatisfaction False Gender False HourlyRate False JobInvolvement False JobLevel False JobRole False JobSatisfaction False MaritalStatus False MonthlyIncome False MonthlyRate False NumCompaniesWorked False Over18 False OverTime False PercentSalaryHike False PerformanceRating False RelationshipSatisfaction False StandardHours False StockOptionLevel False TotalWorkingYears False TrainingTimesLastYear False WorkLifeBalance False YearsAtCompany False YearsInCurrentRole False YearsSinceLastPromotion False YearsWithCurrManager False dtype: bool
df.isnull().sum()
Age 0 Attrition 0 BusinessTravel 0 DailyRate 0 Department 0 DistanceFromHome 0 Education 0 EducationField 0 EmployeeCount 0 EmployeeNumber 0 EnvironmentSatisfaction 0 Gender 0 HourlyRate 0 JobInvolvement 0 JobLevel 0 JobRole 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 MonthlyRate 0 NumCompaniesWorked 0 Over18 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 RelationshipSatisfaction 0 StandardHours 0 StockOptionLevel 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInCurrentRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
sns.histplot(df['DailyRate'], bins=20, kde=True)
plt.show()
plt.scatter(df["YearsAtCompany"],df["TotalWorkingYears"])
<matplotlib.collections.PathCollection at 0x1aab36a1110>
sns.heatmap(df.corr(),annot=True)
C:\Users\Manu\AppData\Local\Temp\ipykernel_13256\4277794465.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(),annot=True)
<Axes: >
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1aad8b31150>
gender_count = df['Gender'].value_counts()
sns.barplot(x=gender_count.index, y=gender_count.values)
plt.show()
sns.boxplot(df.DailyRate)
<Axes: >
sns.boxplot(df.DistanceFromHome)
<Axes: >
df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
x = df.drop('Attrition', axis=1)
x.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
x.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
x.shape
(1470, 34)
type(x)
pandas.core.frame.DataFrame
y = df['Attrition']
y.head()
0 Yes 1 No 2 Yes 3 No 4 No Name: Attrition, dtype: object
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
x["BusinessTravel"]=le.fit_transform(x["BusinessTravel"])
x["Department"]=le.fit_transform(x["Department"])
x["EducationField"]=le.fit_transform(x["EducationField"])
x["Gender"]=le.fit_transform(x["Gender"])
x["JobRole"]=le.fit_transform(x["JobRole"])
x["MaritalStatus"]=le.fit_transform(x["MaritalStatus"])
x["Over18"]=le.fit_transform(x["Over18"])
x["OverTime"]=le.fit_transform(x["OverTime"])
x.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
numeric_cols = [col for col in df.columns if col not in categorical_cols]
# Encode categorical columns using label encoding
label_encoder = LabelEncoder()
for col in categorical_cols:
df[col] = label_encoder.fit_transform(df[col])
# Encode the target variable "Attrition"
df['Attrition'] = label_encoder.fit_transform(df['Attrition'])
# Now, the entire dataset is encoded, including categorical columns and the target variable
X = df.drop(columns=['Attrition'])
y = df['Attrition']
x.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
y.head()
0 1 1 0 2 1 3 0 4 0 Name: Attrition, dtype: int32
from sklearn.preprocessing import MinMaxScaler
ms=MinMaxScaler()
X_Scaled=ms.fit_transform(x)
X_Scaled=pd.DataFrame(ms.fit_transform(x),columns=x.columns)
X_Scaled.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.547619 | 1.0 | 0.715820 | 1.0 | 0.000000 | 0.25 | 0.2 | 0.0 | 0.000000 | 0.333333 | ... | 0.000000 | 0.0 | 0.000000 | 0.200 | 0.0 | 0.000000 | 0.15 | 0.222222 | 0.000000 | 0.294118 |
| 1 | 0.738095 | 0.5 | 0.126700 | 0.5 | 0.250000 | 0.00 | 0.2 | 0.0 | 0.000484 | 0.666667 | ... | 1.000000 | 0.0 | 0.333333 | 0.250 | 0.5 | 0.666667 | 0.25 | 0.388889 | 0.066667 | 0.411765 |
| 2 | 0.452381 | 1.0 | 0.909807 | 0.5 | 0.035714 | 0.25 | 0.8 | 0.0 | 0.001451 | 1.000000 | ... | 0.333333 | 0.0 | 0.000000 | 0.175 | 0.5 | 0.666667 | 0.00 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 0.357143 | 0.5 | 0.923407 | 0.5 | 0.071429 | 0.75 | 0.2 | 0.0 | 0.001935 | 1.000000 | ... | 0.666667 | 0.0 | 0.000000 | 0.200 | 0.5 | 0.666667 | 0.20 | 0.388889 | 0.200000 | 0.000000 |
| 4 | 0.214286 | 1.0 | 0.350036 | 0.5 | 0.035714 | 0.00 | 0.6 | 0.0 | 0.002903 | 0.000000 | ... | 1.000000 | 0.0 | 0.333333 | 0.150 | 0.5 | 0.666667 | 0.05 | 0.111111 | 0.133333 | 0.117647 |
5 rows × 34 columns
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X_Scaled,y,test_size =0.2,random_state =0)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(1176, 34) (294, 34) (1176,) (294,)
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(x_train,y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
pred=model.predict(x_test)
pred
array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0])
y_test
442 0
1091 0
981 1
785 0
1332 1
..
1439 0
481 0
124 1
198 0
1229 0
Name: Attrition, Length: 294, dtype: int32
df
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 36 | 0 | 1 | 884 | 1 | 23 | 2 | 3 | 1 | 2061 | ... | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 39 | 0 | 2 | 613 | 1 | 6 | 1 | 3 | 1 | 2062 | ... | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 27 | 0 | 2 | 155 | 1 | 4 | 3 | 1 | 1 | 2064 | ... | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 49 | 0 | 1 | 1023 | 2 | 2 | 3 | 3 | 1 | 2065 | ... | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 34 | 0 | 2 | 628 | 1 | 8 | 3 | 3 | 1 | 2068 | ... | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 35 columns
x.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
model.predict(ms.transform([[ 41,2,1102,2,1,2,1,1,1,2,0,94,3,2,7,4,2,5993,19479,8,0,1,11,3,1,80,0,8,0,1,6,4,0,5]]))
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but MinMaxScaler was fitted with feature names warnings.warn( C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names warnings.warn(
array([1])
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
accuracy_score(y_test,pred)
0.8843537414965986
confusion_matrix(y_test,pred)
array([[242, 3],
[ 31, 18]], dtype=int64)
pd.crosstab(y_test,pred)
| col_0 | 0 | 1 |
|---|---|---|
| Attrition | ||
| 0 | 242 | 3 |
| 1 | 31 | 18 |
a1=(242+18)/294 #accuracy
a1
0.8843537414965986
r=18/(18+31) #recall
r
0.3673469387755102
p=18/(3+18) #precision
p
0.8571428571428571
f1=2*p*r/(p+r) #F1 score
f1
0.5142857142857143
print(classification_report(y_test,pred))
precision recall f1-score support
0 0.89 0.99 0.93 245
1 0.86 0.37 0.51 49
accuracy 0.88 294
macro avg 0.87 0.68 0.72 294
weighted avg 0.88 0.88 0.86 294
probability=model.predict_proba(x_test)[:,1]
probability
array([0.16000127, 0.20600667, 0.31532384, 0.09242886, 0.63667551,
0.06153061, 0.61819432, 0.0757087 , 0.00841372, 0.3912069 ,
0.05398439, 0.33293123, 0.02020698, 0.67215483, 0.19786547,
0.03454902, 0.11043981, 0.17101703, 0.04477777, 0.22783614,
0.2335018 , 0.01553905, 0.06464492, 0.05029956, 0.58792413,
0.44849464, 0.07412714, 0.04460935, 0.67666632, 0.0584383 ,
0.01599026, 0.03521098, 0.06963085, 0.17397462, 0.07830857,
0.04288032, 0.08150424, 0.07106342, 0.03622137, 0.05223965,
0.04862098, 0.02091497, 0.01819361, 0.01362467, 0.02873997,
0.50236969, 0.41553218, 0.00306874, 0.73976412, 0.51382382,
0.09637213, 0.48845516, 0.08036228, 0.25757243, 0.66516772,
0.26308027, 0.01964858, 0.30198497, 0.02919946, 0.16038964,
0.02102747, 0.21670232, 0.13981568, 0.0358316 , 0.37208403,
0.03002317, 0.29091186, 0.16041142, 0.10437497, 0.08695177,
0.08217589, 0.30984518, 0.08531362, 0.07420689, 0.12268651,
0.06192552, 0.04640904, 0.07624712, 0.19738483, 0.03236316,
0.00884439, 0.0244108 , 0.13635803, 0.0260104 , 0.03341008,
0.08186888, 0.00499397, 0.03474852, 0.03858027, 0.14602694,
0.26167665, 0.16667357, 0.27400109, 0.24159565, 0.02160421,
0.17748606, 0.34076078, 0.28022482, 0.06914126, 0.05003806,
0.24437761, 0.74698271, 0.35438567, 0.01920627, 0.08778845,
0.03255847, 0.05461351, 0.15123251, 0.06843702, 0.13752637,
0.09584388, 0.04669882, 0.02493091, 0.15383171, 0.07081259,
0.03089296, 0.0537667 , 0.11554316, 0.00881616, 0.01263271,
0.17552253, 0.05045234, 0.08823238, 0.82995757, 0.03017756,
0.0236819 , 0.0087012 , 0.1349589 , 0.16474801, 0.05202613,
0.01524549, 0.29278083, 0.54767448, 0.34275448, 0.04629541,
0.38966344, 0.61333366, 0.14552367, 0.07402366, 0.24143471,
0.09418418, 0.0689069 , 0.10061956, 0.19346327, 0.20026293,
0.03004939, 0.14900424, 0.00348846, 0.11225149, 0.15843155,
0.06047573, 0.18601882, 0.06085869, 0.12221317, 0.03280184,
0.02738799, 0.06356425, 0.08302382, 0.01541716, 0.014665 ,
0.38517822, 0.01264231, 0.14961974, 0.80508787, 0.11598661,
0.2842811 , 0.17020143, 0.1530583 , 0.02764153, 0.00613226,
0.04191632, 0.09782393, 0.11551417, 0.10377982, 0.01779313,
0.14371315, 0.10615435, 0.10298963, 0.05132621, 0.09061081,
0.02897383, 0.09924087, 0.00512032, 0.75108423, 0.04296968,
0.04062134, 0.37518972, 0.04563128, 0.7251816 , 0.10671665,
0.36949086, 0.38146941, 0.32095493, 0.05266802, 0.08172004,
0.13947833, 0.04334317, 0.01469593, 0.26413988, 0.06330966,
0.1614747 , 0.15380517, 0.67152357, 0.05840793, 0.27891823,
0.04512564, 0.46033865, 0.00348431, 0.14068967, 0.02747401,
0.12714133, 0.17284246, 0.07341066, 0.10099827, 0.16870885,
0.02560842, 0.01824031, 0.08670796, 0.02834237, 0.13710215,
0.08778935, 0.2200061 , 0.73401148, 0.15938978, 0.4095449 ,
0.01513845, 0.11306309, 0.21497506, 0.32337575, 0.03409266,
0.04256318, 0.32157531, 0.05454465, 0.02348479, 0.16423352,
0.32696147, 0.22892063, 0.00877159, 0.08198819, 0.01156361,
0.1408691 , 0.29235147, 0.01270305, 0.17329916, 0.04081391,
0.04094165, 0.42771425, 0.34958286, 0.03766772, 0.12025286,
0.37698923, 0.3192629 , 0.79559338, 0.05385659, 0.21597037,
0.06383728, 0.00570991, 0.66018187, 0.35855286, 0.37783606,
0.36781398, 0.03554512, 0.21718203, 0.05943622, 0.06554485,
0.10081475, 0.00818713, 0.26591316, 0.42809675, 0.06542835,
0.09296803, 0.01259826, 0.14226651, 0.05072662, 0.02372258,
0.02586923, 0.06760427, 0.24315648, 0.26961432, 0.19831733,
0.2652296 , 0.0165923 , 0.15784236, 0.08398982, 0.02711775,
0.18750547, 0.00783535, 0.2844239 , 0.00270742, 0.02484969,
0.22585745, 0.72775605, 0.07691547, 0.26304359])
fpr,tpr,threshsholds = roc_curve(y_test,probability)
plt.plot(fpr,tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC CURVE')
plt.show()
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(x_train,y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
pred=dtc.predict(x_test)
pred
array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
0, 0, 1, 0, 0, 0, 0, 0])
y_test
442 0
1091 0
981 1
785 0
1332 1
..
1439 0
481 0
124 1
198 0
1229 0
Name: Attrition, Length: 294, dtype: int32
df
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 36 | 0 | 1 | 884 | 1 | 23 | 2 | 3 | 1 | 2061 | ... | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 39 | 0 | 2 | 613 | 1 | 6 | 1 | 3 | 1 | 2062 | ... | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 27 | 0 | 2 | 155 | 1 | 4 | 3 | 1 | 1 | 2064 | ... | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 49 | 0 | 1 | 1023 | 2 | 2 | 3 | 3 | 1 | 2065 | ... | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 34 | 0 | 2 | 628 | 1 | 8 | 3 | 3 | 1 | 2068 | ... | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 35 columns
dtc.predict(ms.transform([[41,2,1102,2,1,2,1,1,1,2,0,94,3,2,7,4,2,5993,19479,8,0,1,11,3,1,80,0,8,0,1,6,4,0,5]]))
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but MinMaxScaler was fitted with feature names warnings.warn( C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
array([1])
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
accuracy_score(y_test,pred)
0.7517006802721088
confusion_matrix(y_test,pred)
array([[205, 40],
[ 33, 16]], dtype=int64)
pd.crosstab(y_test,pred)
| col_0 | 0 | 1 |
|---|---|---|
| Attrition | ||
| 0 | 205 | 40 |
| 1 | 33 | 16 |
a2 = (209+18)/294 #accuarcy
a2
0.7721088435374149
r2 = 18/(31+18) #recall
r2
0.3673469387755102
p2 = 18/(36+18) #precision
p2
0.3333333333333333
f2 = 2*p2*r2/(p2+r2) # F1 score
f2
0.34951456310679613
print(classification_report(y_test,pred))
precision recall f1-score support
0 0.86 0.84 0.85 245
1 0.29 0.33 0.30 49
accuracy 0.75 294
macro avg 0.57 0.58 0.58 294
weighted avg 0.77 0.75 0.76 294
probability=dtc.predict_proba(x_test)[:,1]
probability
array([0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0.,
0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1.,
0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1.,
0., 0., 0., 0., 0.])
fpr,tpr,threshsholds = roc_curve(y_test,probability)
plt.plot(fpr,tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC CURVE')
plt.show()
from sklearn import tree
plt.figure(figsize=(25,15))
tree.plot_tree(dtc,filled=True)
[Text(0.322017553667263, 0.9722222222222222, 'x[27] <= 0.038\ngini = 0.269\nsamples = 1176\nvalue = [988, 188]'), Text(0.07871198568872988, 0.9166666666666666, 'x[16] <= 0.75\ngini = 0.5\nsamples = 78\nvalue = [39, 39]'), Text(0.046511627906976744, 0.8611111111111112, 'x[4] <= 0.554\ngini = 0.426\nsamples = 39\nvalue = [27, 12]'), Text(0.028622540250447227, 0.8055555555555556, 'x[15] <= 0.167\ngini = 0.312\nsamples = 31\nvalue = [25, 6]'), Text(0.014311270125223614, 0.75, 'x[9] <= 0.5\ngini = 0.49\nsamples = 7\nvalue = [3, 4]'), Text(0.007155635062611807, 0.6944444444444444, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.02146690518783542, 0.6944444444444444, 'x[4] <= 0.143\ngini = 0.375\nsamples = 4\nvalue = [3, 1]'), Text(0.014311270125223614, 0.6388888888888888, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.028622540250447227, 0.6388888888888888, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.04293381037567084, 0.75, 'x[19] <= 0.056\ngini = 0.153\nsamples = 24\nvalue = [22, 2]'), Text(0.03577817531305903, 0.6944444444444444, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.05008944543828265, 0.6944444444444444, 'x[9] <= 0.167\ngini = 0.083\nsamples = 23\nvalue = [22, 1]'), Text(0.04293381037567084, 0.6388888888888888, 'x[23] <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.03577817531305903, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.05008944543828265, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.057245080500894455, 0.6388888888888888, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]'), Text(0.06440071556350627, 0.8055555555555556, 'x[22] <= 0.679\ngini = 0.375\nsamples = 8\nvalue = [2, 6]'), Text(0.057245080500894455, 0.75, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'), Text(0.07155635062611806, 0.75, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.11091234347048301, 0.8611111111111112, 'x[11] <= 0.364\ngini = 0.426\nsamples = 39\nvalue = [12, 27]'), Text(0.09302325581395349, 0.8055555555555556, 'x[0] <= 0.369\ngini = 0.133\nsamples = 14\nvalue = [1, 13]'), Text(0.08586762075134168, 0.75, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]'), Text(0.1001788908765653, 0.75, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.12880143112701253, 0.8055555555555556, 'x[8] <= 0.105\ngini = 0.493\nsamples = 25\nvalue = [11, 14]'), Text(0.11449016100178891, 0.75, 'x[1] <= 0.75\ngini = 0.278\nsamples = 6\nvalue = [5, 1]'), Text(0.1073345259391771, 0.6944444444444444, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.12164579606440072, 0.6944444444444444, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'), Text(0.14311270125223613, 0.75, 'x[15] <= 0.5\ngini = 0.432\nsamples = 19\nvalue = [6, 13]'), Text(0.13595706618962433, 0.6944444444444444, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]'), Text(0.15026833631484796, 0.6944444444444444, 'x[6] <= 0.4\ngini = 0.5\nsamples = 12\nvalue = [6, 6]'), Text(0.13595706618962433, 0.6388888888888888, 'x[32] <= 0.033\ngini = 0.278\nsamples = 6\nvalue = [5, 1]'), Text(0.12880143112701253, 0.5833333333333334, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'), Text(0.14311270125223613, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.16457960644007155, 0.6388888888888888, 'x[8] <= 0.249\ngini = 0.278\nsamples = 6\nvalue = [1, 5]'), Text(0.15742397137745975, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.17173524150268335, 0.5833333333333334, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]'), Text(0.565323121645796, 0.9166666666666666, 'x[21] <= 0.5\ngini = 0.235\nsamples = 1098\nvalue = [949, 149]'), Text(0.31937611806797855, 0.8611111111111112, 'x[29] <= 0.167\ngini = 0.162\nsamples = 798\nvalue = [727, 71]'), Text(0.18604651162790697, 0.8055555555555556, 'x[8] <= 0.445\ngini = 0.38\nsamples = 47\nvalue = [35, 12]'), Text(0.17173524150268335, 0.75, 'x[16] <= 0.75\ngini = 0.1\nsamples = 19\nvalue = [18, 1]'), Text(0.16457960644007155, 0.6944444444444444, 'gini = 0.0\nsamples = 18\nvalue = [18, 0]'), Text(0.17889087656529518, 0.6944444444444444, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.2003577817531306, 0.75, 'x[17] <= 0.094\ngini = 0.477\nsamples = 28\nvalue = [17, 11]'), Text(0.19320214669051877, 0.6944444444444444, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'), Text(0.2075134168157424, 0.6944444444444444, 'x[32] <= 0.6\ngini = 0.413\nsamples = 24\nvalue = [17, 7]'), Text(0.2003577817531306, 0.6388888888888888, 'x[11] <= 0.486\ngini = 0.351\nsamples = 22\nvalue = [17, 5]'), Text(0.19320214669051877, 0.5833333333333334, 'x[24] <= 0.5\ngini = 0.496\nsamples = 11\nvalue = [6, 5]'), Text(0.18604651162790697, 0.5277777777777778, 'x[4] <= 0.036\ngini = 0.408\nsamples = 7\nvalue = [2, 5]'), Text(0.17889087656529518, 0.4722222222222222, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.19320214669051877, 0.4722222222222222, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]'), Text(0.2003577817531306, 0.5277777777777778, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]'), Text(0.2075134168157424, 0.5833333333333334, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]'), Text(0.2146690518783542, 0.6388888888888888, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.4527057245080501, 0.8055555555555556, 'x[30] <= 0.963\ngini = 0.145\nsamples = 751\nvalue = [692, 59]'), Text(0.4455500894454383, 0.75, 'x[30] <= 0.113\ngini = 0.143\nsamples = 750\nvalue = [692, 58]'), Text(0.31440071556350624, 0.6944444444444444, 'x[9] <= 0.167\ngini = 0.218\nsamples = 257\nvalue = [225, 32]'), Text(0.2701252236135957, 0.6388888888888888, 'x[33] <= 0.147\ngini = 0.355\nsamples = 65\nvalue = [50, 15]'), Text(0.24686940966010734, 0.5833333333333334, 'x[33] <= 0.029\ngini = 0.303\nsamples = 59\nvalue = [48, 11]'), Text(0.22182468694096602, 0.5277777777777778, 'x[12] <= 0.5\ngini = 0.463\nsamples = 22\nvalue = [14, 8]'), Text(0.2075134168157424, 0.4722222222222222, 'x[11] <= 0.179\ngini = 0.198\nsamples = 9\nvalue = [8, 1]'), Text(0.2003577817531306, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.2146690518783542, 0.4166666666666667, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]'), Text(0.23613595706618962, 0.4722222222222222, 'x[11] <= 0.4\ngini = 0.497\nsamples = 13\nvalue = [6, 7]'), Text(0.22898032200357782, 0.4166666666666667, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]'), Text(0.24329159212880144, 0.4166666666666667, 'x[4] <= 0.286\ngini = 0.346\nsamples = 9\nvalue = [2, 7]'), Text(0.23613595706618962, 0.3611111111111111, 'x[2] <= 0.369\ngini = 0.444\nsamples = 3\nvalue = [2, 1]'), Text(0.22898032200357782, 0.3055555555555556, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.24329159212880144, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.2504472271914132, 0.3611111111111111, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'), Text(0.27191413237924866, 0.5277777777777778, 'x[15] <= 0.167\ngini = 0.149\nsamples = 37\nvalue = [34, 3]'), Text(0.26475849731663686, 0.4722222222222222, 'x[29] <= 0.5\ngini = 0.5\nsamples = 6\nvalue = [3, 3]'), Text(0.25760286225402507, 0.4166666666666667, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.27191413237924866, 0.4166666666666667, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.27906976744186046, 0.4722222222222222, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]'), Text(0.29338103756708406, 0.5833333333333334, 'x[8] <= 0.065\ngini = 0.444\nsamples = 6\nvalue = [2, 4]'), Text(0.28622540250447226, 0.5277777777777778, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.3005366726296959, 0.5277777777777778, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'), Text(0.35867620751341683, 0.6388888888888888, 'x[0] <= 0.321\ngini = 0.161\nsamples = 192\nvalue = [175, 17]'), Text(0.3220035778175313, 0.5833333333333334, 'x[6] <= 0.1\ngini = 0.294\nsamples = 67\nvalue = [55, 12]'), Text(0.3148479427549195, 0.5277777777777778, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.3291592128801431, 0.5277777777777778, 'x[29] <= 0.5\ngini = 0.26\nsamples = 65\nvalue = [55, 10]'), Text(0.3112701252236136, 0.4722222222222222, 'x[11] <= 0.679\ngini = 0.469\nsamples = 16\nvalue = [10, 6]'), Text(0.3041144901610018, 0.4166666666666667, 'x[4] <= 0.018\ngini = 0.444\nsamples = 9\nvalue = [3, 6]'), Text(0.29695885509839, 0.3611111111111111, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.3112701252236136, 0.3611111111111111, 'x[6] <= 0.4\ngini = 0.245\nsamples = 7\nvalue = [1, 6]'), Text(0.3041144901610018, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.3184257602862254, 0.3055555555555556, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'), Text(0.3184257602862254, 0.4166666666666667, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]'), Text(0.3470483005366726, 0.4722222222222222, 'x[2] <= 0.037\ngini = 0.15\nsamples = 49\nvalue = [45, 4]'), Text(0.33989266547406083, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.3542039355992844, 0.4166666666666667, 'x[2] <= 0.938\ngini = 0.117\nsamples = 48\nvalue = [45, 3]'), Text(0.3470483005366726, 0.3611111111111111, 'x[5] <= 0.875\ngini = 0.081\nsamples = 47\nvalue = [45, 2]'), Text(0.33273703041144903, 0.3055555555555556, 'x[12] <= 0.167\ngini = 0.043\nsamples = 45\nvalue = [44, 1]'), Text(0.32558139534883723, 0.25, 'x[22] <= 0.214\ngini = 0.444\nsamples = 3\nvalue = [2, 1]'), Text(0.3184257602862254, 0.19444444444444445, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.33273703041144903, 0.19444444444444445, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.33989266547406083, 0.25, 'gini = 0.0\nsamples = 42\nvalue = [42, 0]'), Text(0.3613595706618962, 0.3055555555555556, 'x[6] <= 0.9\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.3542039355992844, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.3685152057245081, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.3613595706618962, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.3953488372093023, 0.5833333333333334, 'x[8] <= 0.022\ngini = 0.077\nsamples = 125\nvalue = [120, 5]'), Text(0.3756708407871199, 0.5277777777777778, 'x[27] <= 0.188\ngini = 0.5\nsamples = 4\nvalue = [2, 2]'), Text(0.3685152057245081, 0.4722222222222222, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.3828264758497317, 0.4722222222222222, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.4150268336314848, 0.5277777777777778, 'x[18] <= 0.968\ngini = 0.048\nsamples = 121\nvalue = [118, 3]'), Text(0.39713774597495527, 0.4722222222222222, 'x[2] <= 0.98\ngini = 0.033\nsamples = 118\nvalue = [116, 2]'), Text(0.3828264758497317, 0.4166666666666667, 'x[14] <= 0.938\ngini = 0.017\nsamples = 114\nvalue = [113, 1]'), Text(0.3756708407871199, 0.3611111111111111, 'gini = 0.0\nsamples = 107\nvalue = [107, 0]'), Text(0.38998211091234347, 0.3611111111111111, 'x[16] <= 0.25\ngini = 0.245\nsamples = 7\nvalue = [6, 1]'), Text(0.3828264758497317, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.39713774597495527, 0.3055555555555556, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]'), Text(0.41144901610017887, 0.4166666666666667, 'x[12] <= 0.833\ngini = 0.375\nsamples = 4\nvalue = [3, 1]'), Text(0.40429338103756707, 0.3611111111111111, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.4186046511627907, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.4329159212880143, 0.4722222222222222, 'x[27] <= 0.15\ngini = 0.444\nsamples = 3\nvalue = [2, 1]'), Text(0.4257602862254025, 0.4166666666666667, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]'), Text(0.4400715563506261, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5766994633273703, 0.6944444444444444, 'x[30] <= 0.787\ngini = 0.1\nsamples = 493\nvalue = [467, 26]'), Text(0.5398032200357782, 0.6388888888888888, 'x[15] <= 0.5\ngini = 0.094\nsamples = 486\nvalue = [462, 24]'), Text(0.4874776386404293, 0.5833333333333334, 'x[14] <= 0.938\ngini = 0.154\nsamples = 191\nvalue = [175, 16]'), Text(0.4803220035778175, 0.5277777777777778, 'x[18] <= 0.481\ngini = 0.145\nsamples = 190\nvalue = [175, 15]'), Text(0.46153846153846156, 0.4722222222222222, 'x[33] <= 0.794\ngini = 0.221\nsamples = 95\nvalue = [83, 12]'), Text(0.4543828264758497, 0.4166666666666667, 'x[18] <= 0.47\ngini = 0.207\nsamples = 94\nvalue = [83, 11]'), Text(0.4472271914132379, 0.3611111111111111, 'x[5] <= 0.375\ngini = 0.192\nsamples = 93\nvalue = [83, 10]'), Text(0.42397137745974955, 0.3055555555555556, 'x[6] <= 0.9\ngini = 0.363\nsamples = 21\nvalue = [16, 5]'), Text(0.41681574239713776, 0.25, 'x[17] <= 0.413\ngini = 0.266\nsamples = 19\nvalue = [16, 3]'), Text(0.40250447227191416, 0.19444444444444445, 'x[19] <= 0.056\ngini = 0.117\nsamples = 16\nvalue = [15, 1]'), Text(0.3953488372093023, 0.1388888888888889, 'x[22] <= 0.107\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.3881932021466905, 0.08333333333333333, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.40250447227191416, 0.08333333333333333, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.40966010733452596, 0.1388888888888889, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]'), Text(0.43112701252236135, 0.19444444444444445, 'x[14] <= 0.5\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.42397137745974955, 0.1388888888888889, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.43828264758497315, 0.1388888888888889, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.43112701252236135, 0.25, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.47048300536672627, 0.3055555555555556, 'x[31] <= 0.139\ngini = 0.129\nsamples = 72\nvalue = [67, 5]'), Text(0.4525939177101968, 0.25, 'x[8] <= 0.68\ngini = 0.444\nsamples = 6\nvalue = [4, 2]'), Text(0.44543828264758495, 0.19444444444444445, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]'), Text(0.4597495527728086, 0.19444444444444445, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.4883720930232558, 0.25, 'x[2] <= 0.958\ngini = 0.087\nsamples = 66\nvalue = [63, 3]'), Text(0.4740608228980322, 0.19444444444444445, 'x[28] <= 0.583\ngini = 0.061\nsamples = 64\nvalue = [62, 2]'), Text(0.4669051878354204, 0.1388888888888889, 'gini = 0.0\nsamples = 52\nvalue = [52, 0]'), Text(0.481216457960644, 0.1388888888888889, 'x[3] <= 0.75\ngini = 0.278\nsamples = 12\nvalue = [10, 2]'), Text(0.4740608228980322, 0.08333333333333333, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]'), Text(0.4883720930232558, 0.08333333333333333, 'x[9] <= 0.5\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.481216457960644, 0.027777777777777776, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.49552772808586765, 0.027777777777777776, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.5026833631484794, 0.19444444444444445, 'x[30] <= 0.35\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.49552772808586765, 0.1388888888888889, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5098389982110912, 0.1388888888888889, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.46153846153846156, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.46869409660107336, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.4991055456171735, 0.4722222222222222, 'x[19] <= 0.5\ngini = 0.061\nsamples = 95\nvalue = [92, 3]'), Text(0.4919499105545617, 0.4166666666666667, 'gini = 0.0\nsamples = 76\nvalue = [76, 0]'), Text(0.5062611806797853, 0.4166666666666667, 'x[33] <= 0.088\ngini = 0.266\nsamples = 19\nvalue = [16, 3]'), Text(0.4919499105545617, 0.3611111111111111, 'x[15] <= 0.167\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.4847942754919499, 0.3055555555555556, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.4991055456171735, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.5205724508050089, 0.3611111111111111, 'x[17] <= 0.108\ngini = 0.117\nsamples = 16\nvalue = [15, 1]'), Text(0.5134168157423972, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5277280858676208, 0.3055555555555556, 'gini = 0.0\nsamples = 15\nvalue = [15, 0]'), Text(0.49463327370304117, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.592128801431127, 0.5833333333333334, 'x[22] <= 0.036\ngini = 0.053\nsamples = 295\nvalue = [287, 8]'), Text(0.5670840787119857, 0.5277777777777778, 'x[32] <= 0.7\ngini = 0.159\nsamples = 46\nvalue = [42, 4]'), Text(0.5599284436493739, 0.4722222222222222, 'x[12] <= 0.167\ngini = 0.124\nsamples = 45\nvalue = [42, 3]'), Text(0.5420393559928444, 0.4166666666666667, 'x[4] <= 0.054\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.5348837209302325, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.5491949910554562, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5778175313059034, 0.4166666666666667, 'x[27] <= 0.688\ngini = 0.089\nsamples = 43\nvalue = [41, 2]'), Text(0.5635062611806798, 0.3611111111111111, 'x[14] <= 0.062\ngini = 0.048\nsamples = 41\nvalue = [40, 1]'), Text(0.556350626118068, 0.3055555555555556, 'x[9] <= 0.167\ngini = 0.375\nsamples = 4\nvalue = [3, 1]'), Text(0.5491949910554562, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5635062611806798, 0.25, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.5706618962432916, 0.3055555555555556, 'gini = 0.0\nsamples = 37\nvalue = [37, 0]'), Text(0.592128801431127, 0.3611111111111111, 'x[9] <= 0.333\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.5849731663685152, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.5992844364937389, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.5742397137745975, 0.4722222222222222, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.6171735241502684, 0.5277777777777778, 'x[17] <= 0.056\ngini = 0.032\nsamples = 249\nvalue = [245, 4]'), Text(0.5992844364937389, 0.4722222222222222, 'x[16] <= 0.75\ngini = 0.32\nsamples = 5\nvalue = [4, 1]'), Text(0.592128801431127, 0.4166666666666667, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]'), Text(0.6064400715563506, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.6350626118067979, 0.4722222222222222, 'x[2] <= 0.015\ngini = 0.024\nsamples = 244\nvalue = [241, 3]'), Text(0.6207513416815742, 0.4166666666666667, 'x[4] <= 0.875\ngini = 0.278\nsamples = 6\nvalue = [5, 1]'), Text(0.6135957066189625, 0.3611111111111111, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'), Text(0.627906976744186, 0.3611111111111111, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.6493738819320215, 0.4166666666666667, 'x[24] <= 0.167\ngini = 0.017\nsamples = 238\nvalue = [236, 2]'), Text(0.6422182468694096, 0.3611111111111111, 'x[29] <= 0.833\ngini = 0.073\nsamples = 53\nvalue = [51, 2]'), Text(0.627906976744186, 0.3055555555555556, 'x[33] <= 0.088\ngini = 0.041\nsamples = 48\nvalue = [47, 1]'), Text(0.6207513416815742, 0.25, 'x[14] <= 0.312\ngini = 0.245\nsamples = 7\nvalue = [6, 1]'), Text(0.6135957066189625, 0.19444444444444445, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.627906976744186, 0.19444444444444445, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]'), Text(0.6350626118067979, 0.25, 'gini = 0.0\nsamples = 41\nvalue = [41, 0]'), Text(0.6565295169946332, 0.3055555555555556, 'x[0] <= 0.631\ngini = 0.32\nsamples = 5\nvalue = [4, 1]'), Text(0.6493738819320215, 0.25, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]'), Text(0.6636851520572451, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.6565295169946332, 0.3611111111111111, 'gini = 0.0\nsamples = 185\nvalue = [185, 0]'), Text(0.6135957066189625, 0.6388888888888888, 'x[10] <= 0.5\ngini = 0.408\nsamples = 7\nvalue = [5, 2]'), Text(0.6064400715563506, 0.5833333333333334, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.6207513416815742, 0.5833333333333334, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'), Text(0.4598613595706619, 0.75, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.8112701252236136, 0.8611111111111112, 'x[17] <= 0.157\ngini = 0.385\nsamples = 300\nvalue = [222, 78]'), Text(0.7182468694096601, 0.8055555555555556, 'x[26] <= 0.167\ngini = 0.5\nsamples = 96\nvalue = [49, 47]'), Text(0.6815742397137746, 0.75, 'x[4] <= 0.161\ngini = 0.459\nsamples = 42\nvalue = [15, 27]'), Text(0.6565295169946332, 0.6944444444444444, 'x[8] <= 0.415\ngini = 0.499\nsamples = 23\nvalue = [12, 11]'), Text(0.6422182468694096, 0.6388888888888888, 'x[18] <= 0.561\ngini = 0.355\nsamples = 13\nvalue = [3, 10]'), Text(0.6350626118067979, 0.5833333333333334, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]'), Text(0.6493738819320215, 0.5833333333333334, 'x[9] <= 0.333\ngini = 0.48\nsamples = 5\nvalue = [3, 2]'), Text(0.6422182468694096, 0.5277777777777778, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.6565295169946332, 0.5277777777777778, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.6708407871198568, 0.6388888888888888, 'x[3] <= 0.75\ngini = 0.18\nsamples = 10\nvalue = [9, 1]'), Text(0.6636851520572451, 0.5833333333333334, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]'), Text(0.6779964221824687, 0.5833333333333334, 'x[22] <= 0.357\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.6708407871198568, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.6851520572450805, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7066189624329159, 0.6944444444444444, 'x[11] <= 0.2\ngini = 0.266\nsamples = 19\nvalue = [3, 16]'), Text(0.6994633273703041, 0.6388888888888888, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7137745974955277, 0.6388888888888888, 'x[27] <= 0.35\ngini = 0.198\nsamples = 18\nvalue = [2, 16]'), Text(0.7066189624329159, 0.5833333333333334, 'x[32] <= 0.433\ngini = 0.111\nsamples = 17\nvalue = [1, 16]'), Text(0.6994633273703041, 0.5277777777777778, 'gini = 0.0\nsamples = 15\nvalue = [0, 15]'), Text(0.7137745974955277, 0.5277777777777778, 'x[31] <= 0.25\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.7066189624329159, 0.4722222222222222, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.7209302325581395, 0.4722222222222222, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7209302325581395, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7549194991055456, 0.75, 'x[0] <= 0.202\ngini = 0.466\nsamples = 54\nvalue = [34, 20]'), Text(0.7352415026833632, 0.6944444444444444, 'x[8] <= 0.164\ngini = 0.245\nsamples = 7\nvalue = [1, 6]'), Text(0.7280858676207513, 0.6388888888888888, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7423971377459749, 0.6388888888888888, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]'), Text(0.774597495527728, 0.6944444444444444, 'x[2] <= 0.622\ngini = 0.418\nsamples = 47\nvalue = [33, 14]'), Text(0.7567084078711985, 0.6388888888888888, 'x[2] <= 0.145\ngini = 0.482\nsamples = 32\nvalue = [19, 13]'), Text(0.7423971377459749, 0.5833333333333334, 'x[2] <= 0.024\ngini = 0.18\nsamples = 10\nvalue = [9, 1]'), Text(0.7352415026833632, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.7495527728085868, 0.5277777777777778, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]'), Text(0.7710196779964222, 0.5833333333333334, 'x[18] <= 0.87\ngini = 0.496\nsamples = 22\nvalue = [10, 12]'), Text(0.7638640429338104, 0.5277777777777778, 'x[8] <= 0.41\ngini = 0.465\nsamples = 19\nvalue = [7, 12]'), Text(0.7495527728085868, 0.4722222222222222, 'x[18] <= 0.715\ngini = 0.469\nsamples = 8\nvalue = [5, 3]'), Text(0.7423971377459749, 0.4166666666666667, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]'), Text(0.7567084078711985, 0.4166666666666667, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.778175313059034, 0.4722222222222222, 'x[0] <= 0.25\ngini = 0.298\nsamples = 11\nvalue = [2, 9]'), Text(0.7710196779964222, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7853309481216458, 0.4166666666666667, 'x[3] <= 0.25\ngini = 0.18\nsamples = 10\nvalue = [1, 9]'), Text(0.778175313059034, 0.3611111111111111, 'x[11] <= 0.286\ngini = 0.5\nsamples = 2\nvalue = [1, 1]'), Text(0.7710196779964222, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.7853309481216458, 0.3055555555555556, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.7924865831842576, 0.3611111111111111, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]'), Text(0.778175313059034, 0.5277777777777778, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.7924865831842576, 0.6388888888888888, 'x[19] <= 0.944\ngini = 0.124\nsamples = 15\nvalue = [14, 1]'), Text(0.7853309481216458, 0.5833333333333334, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]'), Text(0.7996422182468694, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.9042933810375671, 0.8055555555555556, 'x[16] <= 0.75\ngini = 0.258\nsamples = 204\nvalue = [173, 31]'), Text(0.8479427549194991, 0.75, 'x[17] <= 0.992\ngini = 0.138\nsamples = 147\nvalue = [136, 11]'), Text(0.8407871198568873, 0.6944444444444444, 'x[4] <= 0.482\ngini = 0.128\nsamples = 146\nvalue = [136, 10]'), Text(0.8211091234347049, 0.6388888888888888, 'x[30] <= 0.063\ngini = 0.038\nsamples = 104\nvalue = [102, 2]'), Text(0.813953488372093, 0.5833333333333334, 'x[11] <= 0.193\ngini = 0.32\nsamples = 10\nvalue = [8, 2]'), Text(0.8067978533094812, 0.5277777777777778, 'x[28] <= 0.417\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.7996422182468694, 0.4722222222222222, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.813953488372093, 0.4722222222222222, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.8211091234347049, 0.5277777777777778, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]'), Text(0.8282647584973166, 0.5833333333333334, 'gini = 0.0\nsamples = 94\nvalue = [94, 0]'), Text(0.8604651162790697, 0.6388888888888888, 'x[9] <= 0.167\ngini = 0.308\nsamples = 42\nvalue = [34, 8]'), Text(0.8425760286225402, 0.5833333333333334, 'x[18] <= 0.194\ngini = 0.375\nsamples = 4\nvalue = [1, 3]'), Text(0.8354203935599285, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.8497316636851521, 0.5277777777777778, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.8783542039355993, 0.5833333333333334, 'x[0] <= 0.393\ngini = 0.229\nsamples = 38\nvalue = [33, 5]'), Text(0.8640429338103757, 0.5277777777777778, 'x[13] <= 0.375\ngini = 0.5\nsamples = 6\nvalue = [3, 3]'), Text(0.8568872987477638, 0.4722222222222222, 'x[22] <= 0.036\ngini = 0.375\nsamples = 4\nvalue = [3, 1]'), Text(0.8497316636851521, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.8640429338103757, 0.4166666666666667, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'), Text(0.8711985688729875, 0.4722222222222222, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.8926654740608229, 0.5277777777777778, 'x[8] <= 0.992\ngini = 0.117\nsamples = 32\nvalue = [30, 2]'), Text(0.8855098389982111, 0.4722222222222222, 'x[28] <= 0.917\ngini = 0.062\nsamples = 31\nvalue = [30, 1]'), Text(0.8783542039355993, 0.4166666666666667, 'gini = 0.0\nsamples = 30\nvalue = [30, 0]'), Text(0.8926654740608229, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.8998211091234347, 0.4722222222222222, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.855098389982111, 0.6944444444444444, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]'), Text(0.960644007155635, 0.75, 'x[14] <= 0.812\ngini = 0.456\nsamples = 57\nvalue = [37, 20]'), Text(0.9355992844364938, 0.6944444444444444, 'x[32] <= 0.4\ngini = 0.238\nsamples = 29\nvalue = [25, 4]'), Text(0.9212880143112702, 0.6388888888888888, 'x[8] <= 0.071\ngini = 0.142\nsamples = 26\nvalue = [24, 2]'), Text(0.9141323792486583, 0.5833333333333334, 'x[10] <= 0.5\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.9069767441860465, 0.5277777777777778, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.9212880143112702, 0.5277777777777778, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.9284436493738819, 0.5833333333333334, 'gini = 0.0\nsamples = 23\nvalue = [23, 0]'), Text(0.9499105545617174, 0.6388888888888888, 'x[2] <= 0.324\ngini = 0.444\nsamples = 3\nvalue = [1, 2]'), Text(0.9427549194991055, 0.5833333333333334, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.9570661896243292, 0.5833333333333334, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.9856887298747764, 0.6944444444444444, 'x[32] <= 0.1\ngini = 0.49\nsamples = 28\nvalue = [12, 16]'), Text(0.9785330948121646, 0.6388888888888888, 'x[12] <= 0.833\ngini = 0.48\nsamples = 20\nvalue = [12, 8]'), Text(0.9713774597495528, 0.5833333333333334, 'x[4] <= 0.018\ngini = 0.415\nsamples = 17\nvalue = [12, 5]'), Text(0.964221824686941, 0.5277777777777778, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]'), Text(0.9785330948121646, 0.5277777777777778, 'x[17] <= 0.365\ngini = 0.32\nsamples = 15\nvalue = [12, 3]'), Text(0.9713774597495528, 0.4722222222222222, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]'), Text(0.9856887298747764, 0.4722222222222222, 'x[18] <= 0.702\ngini = 0.375\nsamples = 4\nvalue = [1, 3]'), Text(0.9785330948121646, 0.4166666666666667, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.9928443649373881, 0.4166666666666667, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'), Text(0.9856887298747764, 0.5833333333333334, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'), Text(0.9928443649373881, 0.6388888888888888, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]')]
from sklearn.model_selection import GridSearchCV
parameter={
'criterion':['gini','entropy'],
'splitter':['best','random'],
'max_depth':[1,2,3,4,5],
'max_features':['auto', 'sqrt', 'log2']
}
grid_search=GridSearchCV(estimator=dtc,param_grid=parameter,cv=5,scoring="accuracy")
grid_search.fit(x_train,y_train)
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
100 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
estimator._validate_params()
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
validate_parameter_constraints(
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of DecisionTreeClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [ nan nan 0.84013704 0.84013704 0.84013704 0.84013704
nan nan 0.84268662 0.84352687 0.84098449 0.84013704
nan nan 0.83928597 0.8409881 0.83587811 0.83843491
nan nan 0.84182113 0.82994591 0.83247025 0.83671836
nan nan 0.82056978 0.83333213 0.83755499 0.83163361
nan nan 0.84013704 0.84013704 0.84013704 0.84013704
nan nan 0.84013704 0.84013704 0.84013704 0.83928958
nan nan 0.83928958 0.84354129 0.83335016 0.84269383
nan nan 0.83928958 0.83759827 0.82994951 0.83929318
nan nan 0.83758024 0.82312297 0.84099892 0.83163361]
warnings.warn(
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'splitter': ['best', 'random']},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [1, 2, 3, 4, 5],
'max_features': ['auto', 'sqrt', 'log2'],
'splitter': ['best', 'random']},
scoring='accuracy')DecisionTreeClassifier()
DecisionTreeClassifier()
grid_search.best_params_
{'criterion': 'entropy',
'max_depth': 3,
'max_features': 'sqrt',
'splitter': 'random'}
dtc_cv=DecisionTreeClassifier(criterion= 'entropy',
max_depth=3,
max_features='sqrt',
splitter='best')
dtc_cv.fit(x_train,y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='sqrt')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=3, max_features='sqrt')
pred=dtc_cv.predict(x_test)
print(classification_report(y_test,pred))
precision recall f1-score support
0 0.85 0.99 0.91 245
1 0.71 0.10 0.18 49
accuracy 0.84 294
macro avg 0.78 0.55 0.55 294
weighted avg 0.82 0.84 0.79 294
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(0,14))}]
rfc_cv= GridSearchCV(rfc,param_grid=forest_params,cv=10,scoring="accuracy")
rfc_cv.fit(x_train,y_train)
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
50 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
estimator._validate_params()
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
validate_parameter_constraints(
File "C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 0 instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Manu\OneDrive\Documents\Anaconda\Lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [ nan 0.84949297 0.85798204 0.85459221 0.85798204 0.85969144
0.86052441 0.86308127 0.85541793 0.85457772 0.85712734 0.85627988
0.8605389 0.85627264 nan 0.84694336 0.85205708 0.85883674
0.85969144 0.85882225 0.85798928 0.85882949 0.85798204 0.86052441
0.86479067 0.85966971 0.85458496 0.85966247 nan 0.84864552
0.85544691 0.85800377 0.85882949 0.85796755 0.85967695 0.86138635
0.86052441 0.8596842 0.86054614 0.85798928 0.85882225 0.85796755
nan 0.84779082 0.85714907 0.85885122 0.86051717 0.86052441
0.85881501 0.85882225 0.86223381 0.85712734 0.85883674 0.86137911
0.85540345 0.85627264 nan 0.84864552 0.85375199 0.85544691
0.85628712 0.8596842 0.85627988 0.8622483 0.86137187 0.85796031
0.85881501 0.85627264 0.86051717 0.85541793]
warnings.warn(
GridSearchCV(cv=10, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [10, 11, 12, 13, 14],
'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13]}],
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=RandomForestClassifier(),
param_grid=[{'max_depth': [10, 11, 12, 13, 14],
'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13]}],
scoring='accuracy')RandomForestClassifier()
RandomForestClassifier()
pred=rfc_cv.predict(x_test)
print(classification_report(y_test,pred))
precision recall f1-score support
0 0.85 0.98 0.91 245
1 0.67 0.16 0.26 49
accuracy 0.85 294
macro avg 0.76 0.57 0.59 294
weighted avg 0.82 0.85 0.81 294
rfc_cv.best_params_
{'max_depth': 11, 'max_features': 10}
confusion_matrix(y_test,pred)
array([[241, 4],
[ 41, 8]], dtype=int64)
pd.crosstab(y_test,pred)
| col_0 | 0 | 1 |
|---|---|---|
| Attrition | ||
| 0 | 241 | 4 |
| 1 | 41 | 8 |
a3 = (240+11)/294 #accuracy
a3
0.8537414965986394
r3 = 11/(11+38) # recall
r3
0.22448979591836735
p3 = 11/(5+11) #precision
p3
0.6875
f3 = 2*r3*p3/(r3+p3) #E1 score
f3
0.3384615384615384
print(classification_report(y_test,pred))
precision recall f1-score support
0 0.85 0.98 0.91 245
1 0.67 0.16 0.26 49
accuracy 0.85 294
macro avg 0.76 0.57 0.59 294
weighted avg 0.82 0.85 0.81 294